#!/usr/bin/env python
# coding=utf-8
# __author__ = 'Yunchao Ling'

# Merge the per-chunk PubMed XML files stored under `prefix` into a single
# PubmedArticleSet document.
#
# Chunk files are named "pubmed_<first>_<last>.xml", where <first> steps
# through 1, 10001, 20001, ... and <last> = <first> + CHUNK_SIZE - 1.
# A missing chunk file raises IOError/OSError and aborts the merge.

prefix = "/export/bigdata/literaturedata/pubmed_metadata/"

# Number of IDs covered by each chunk file (taken from the file naming).
CHUNK_SIZE = 10000
# First chunk start and exclusive upper bound for chunk starts.
FIRST_ID = 1
STOP_ID = 25410002

# Lines beginning with any of these are dropped from the chunk contents so
# that the XML declaration, DOCTYPE and root element appear only once in
# the merged output (they are written explicitly below).
# NOTE(review): a line is dropped entirely if it merely *starts* with one of
# these markers, even when more content follows on the same line.
SKIPPED_LINE_PREFIXES = (
    "<?xml",
    "<!DOCTYPE",
    "<PubmedArticleSet>",
    "</PubmedArticleSet>",
)

# `with` guarantees the output is flushed and closed even if a chunk file is
# missing or unreadable part-way through the merge.
with open(prefix + "pubmed_1_25416001_20141129.xml", "w") as outfile:
    outfile.write('<?xml version="1.0"?>\n')
    outfile.write('<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2014//EN" "http://www.ncbi.nlm.nih.gov/corehtml/query/DTD/pubmed_140101.dtd">\n')
    outfile.write('<PubmedArticleSet>\n')

    # `range` behaves the same as `xrange` here on Python 2 (the sequence is
    # only a few thousand integers) and also exists on Python 3.
    for i in range(FIRST_ID, STOP_ID, CHUNK_SIZE):
        filename = "pubmed_" + str(i) + "_" + str(i + CHUNK_SIZE - 1) + ".xml"
        with open(prefix + filename, "r") as infile:
            for line in infile:
                # Trailing whitespace (including the newline) is removed and
                # a single "\n" re-added, normalising line endings.
                line = line.rstrip()
                if not line:
                    continue  # skip blank / whitespace-only lines
                if line.startswith(SKIPPED_LINE_PREFIXES):
                    continue  # skip the chunk's own header/root-tag lines
                # No per-line flush: the file object's buffering handles
                # batching, and closing the file flushes whatever remains.
                outfile.write(line + "\n")

    outfile.write('</PubmedArticleSet>\n')
